Load data, calculate variables required, and calculate a dataset which allows you to calculate trend lines over time. That is, I infer which individuals were in the workforce in any one year based on their hire date and tenure.
# Loading and calculating data
data <- read.csv('GCR.csv') %>%
dplyr::mutate(gender = factor(gender, levels = c("M", "F")),
gradeAtHire = as.ordered(gradeAtHire),
age = ageAtHire + tenure,
gend_dept = paste(gender, department, sep = "_"),
leaveYear = hireYear + tenure,
term = ifelse(reason == "terminated", 1, 0))
# str(data)
summary(data)
## id department ageAtHire hireYear
## GCR1 : 1 Administrative:2534 Min. :18.0 Min. :1980
## GCR10 : 1 Operational :2466 1st Qu.:27.0 1st Qu.:1990
## GCR100 : 1 Median :36.0 Median :1999
## GCR1000: 1 Mean :36.1 Mean :1999
## GCR1001: 1 3rd Qu.:45.0 3rd Qu.:2009
## GCR1002: 1 Max. :59.0 Max. :2018
## (Other):4994
## tenure reason gender gradeAtHire current
## Min. : 0.000 resigned :3221 M:2810 1:1764 Mode :logical
## 1st Qu.: 1.000 retired : 269 F:2190 2:2002 FALSE:3952
## Median : 3.000 terminated: 462 3: 775 TRUE :1048
## Mean : 5.759 NA's :1048 4: 394
## 3rd Qu.: 8.000 5: 65
## Max. :38.000
##
## age gend_dept leaveYear term
## Min. :18.00 Length:5000 Min. :1980 Min. :0.0000
## 1st Qu.:32.00 Class :character 1st Qu.:1995 1st Qu.:0.0000
## Median :41.00 Mode :character Median :2006 Median :0.0000
## Mean :41.85 Mean :2005 Mean :0.1169
## 3rd Qu.:51.00 3rd Qu.:2017 3rd Qu.:0.0000
## Max. :95.00 Max. :2018 Max. :1.0000
## NA's :1048
# Workforce over time: ----
all_years <- expand.grid(year = unique(c(data$hireYear, data$leaveYear)),
id = unique(data$id))
data_timeline <- dplyr::full_join(data, all_years, by = "id") %>%
dplyr::filter(hireYear <= year & leaveYear >= year) %>%
dplyr::group_by(id) %>%
dplyr::mutate(reason = ifelse(year != max(year), NA, as.character(reason)),
current = ifelse(year != max(year), TRUE, FALSE),
term = ifelse(year != max(year), NA, as.numeric(term)),
new = hireYear == year)
# Proportion of women in the workforce any year: ----
workforce_prop_timeline <- data_timeline %>%
dplyr::group_by(year, gender) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop_workforce = n / sum(n)) %>%
dplyr::filter(gender == "F") %>%
dplyr::select(year, prop_workforce)
# People who have left: ----
data_left <- data[!data$current, ]
We assume that the business was founded in 1980.
I explored univariate and bivariate distributions of variables I hypothesise may be related to each other and to gender using some basic visualisations. If you are interested, click the buttons below.
Click for visualisations of the current workforce
data_current <- data[data$current, ]
summary(data_current)
## id department ageAtHire hireYear
## GCR1 : 1 Administrative:409 Min. :18.00 Min. :1980
## GCR1002: 1 Operational :639 1st Qu.:27.00 1st Qu.:2003
## GCR1004: 1 Median :36.00 Median :2010
## GCR1017: 1 Mean :35.73 Mean :2008
## GCR1019: 1 3rd Qu.:44.00 3rd Qu.:2015
## GCR102 : 1 Max. :59.00 Max. :2018
## (Other):1042
## tenure reason gender gradeAtHire current
## Min. : 0.00 resigned : 0 M:679 1:396 Mode:logical
## 1st Qu.: 3.00 retired : 0 F:369 2:424 TRUE:1048
## Median : 8.00 terminated: 0 3:150
## Mean :10.32 NA's :1048 4: 71
## 3rd Qu.:15.25 5: 7
## Max. :38.00
##
## age gend_dept leaveYear term
## Min. :18.00 Length:1048 Min. :2018 Min. : NA
## 1st Qu.:36.00 Class :character 1st Qu.:2018 1st Qu.: NA
## Median :46.00 Mode :character Median :2018 Median : NA
## Mean :46.05 Mean :2018 Mean :NaN
## 3rd Qu.:55.00 3rd Qu.:2018 3rd Qu.: NA
## Max. :95.00 Max. :2018 Max. : NA
## NA's :1048
# Age by gender
ggplot(data_current, aes(x = gender, y = age))+
geom_violin()+
BIT_theme
# Tenure by gender
ggplot(data_current, aes(x = gender, y = tenure))+
geom_violin()+
BIT_theme
# Department by gender
ggplot(data_current, aes(x = gender, y = department))+
geom_bin2d()+
BIT_theme
# Grade at hire by gender
ggplot(data_current, aes(x = gender, y = gradeAtHire))+
geom_bin2d()+
BIT_theme
# Tenure by department
ggplot(data_current, aes(x = department, y = tenure))+
geom_boxplot()+
BIT_theme
# Age by department
ggplot(data_current, aes(x = department, y = age))+
geom_violin()+
BIT_theme
# Age by gender and department
ggplot(data_current, aes(x = gend_dept, y = tenure))+
geom_violin()+
BIT_theme
# Mainly the administrative dept has much less experienced women than men, much more similar in operations
ggplot(data_current, aes(x = gend_dept, y = age))+
geom_violin()+
BIT_theme
ggplot(data_current, aes(x = gend_dept, y = ageAtHire))+
geom_violin()+
BIT_theme
Click for visualisations of the previous leavers
# Were there peak leaving years?
hist(data_left$leaveYear, breaks = length(unique(data_left$leaveYear)))
# There were some spikes: 1986, 2002, 2008/9
# Were women overrepresented in those years?
data_years <- data_left %>%
dplyr::group_by(leaveYear, gender) %>%
dplyr::summarise(n = n()) %>%
tidyr::spread(gender, n) %>%
dplyr::mutate(F_prop = `F` / (`F` + `M`))
ggplot(data_years, aes(x = leaveYear, y = F_prop))+
geom_bar(stat = "identity")
# Not noticeably more women leaving in those years as a proportion of everyone leaving that year
# Age by gender
ggplot(data_left, aes(x = gender, y = age))+
geom_violin()+
BIT_theme
# Tenure by gender
ggplot(data_left, aes(x = gender, y = tenure))+
geom_violin()+
BIT_theme
# Department by gender
ggplot(data_left, aes(x = gender, y = department))+
geom_bin2d()+
BIT_theme
# Grade at hire by gender
ggplot(data_left, aes(x = gender, y = gradeAtHire))+
geom_bin2d()+
BIT_theme
# Tenure by department
ggplot(data_left, aes(x = department, y = tenure))+
geom_boxplot()+
BIT_theme
# Age by department
ggplot(data_left, aes(x = department, y = age))+
geom_violin()+
BIT_theme
# Age by gender and department
ggplot(data_left, aes(x = gend_dept, y = tenure))+
geom_violin()+
BIT_theme
# Mainly the administrative dept has much less experienced women than men, much more similar in operations
ggplot(data_left, aes(x = gend_dept, y = age))+
geom_violin()+
BIT_theme
ggplot(data_left, aes(x = gend_dept, y = ageAtHire))+
geom_violin()+
BIT_theme
Calculate the proportion of women in each section of the workforce (department) in a given year, and compare that to the overall proportion of women in the whole workforce.
data_dept_timeline <- data_timeline %>%
dplyr::group_by(year, department, gender) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop = n / sum(n)) %>%
dplyr::filter(gender == "F") %>%
left_join(., workforce_prop_timeline, by = "year")
ggplot(data_dept_timeline, aes(x = year, y = prop, colour = department))+
geom_line()+
geom_smooth(alpha = 0.25)+
geom_line(aes(x = year, y = prop_workforce),
colour = "red")+
labs(title = "Proportion of female workforce over time",
x = "Year",
y = "Proportion of workers that were female",
colour = "Department",
subtitle = "GCR overall proportion in shown in red")+
BIT_theme+
scale_colour_manual(values = strong_palette)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Starting from some basic density plots, I can calculate the proportion of leavers who are women for each separate leaving reason. This tells me whether there are specific reasons that women leave for more often than the general population. Because I saw above that there is a significant difference between the numbers of women between each department, I calculate the reasons separately for each department.
# Reasons for leaving by gender and department
ggplot(data_left, aes(x = gend_dept, y = reason))+
geom_bin2d()+
BIT_theme+
scale_fill_gradientn(colors=strong_palette[1:2])
workforce_dept_prop_timeline <- data_timeline %>%
dplyr::group_by(year, department, gender) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(prop_workforce = n / sum(n)) %>%
dplyr::filter(gender == "F") %>%
dplyr::select(leaveYear = year, department, prop_workforce)
all_reasons <- expand.grid(leaveYear = unique(data_left$leaveYear),
department = unique(data_left$department),
reason = unique(data_left$reason),
gender = unique(data_left$gender))
data_reasons <- data_left %>%
dplyr::group_by(leaveYear, department, reason, gender) %>%
dplyr::summarise(n = n()) %>%
dplyr::ungroup() %>%
dplyr::full_join(., all_reasons,
by = c("leaveYear", "department", "reason", "gender")) %>%
dplyr::mutate(n = ifelse(is.na(n), 0, n)) %>%
dplyr::group_by(leaveYear, department, reason) %>%
dplyr::mutate(tot = sum(n)) %>%
dplyr::ungroup() %>%
dplyr::filter(gender == "F") %>%
dplyr::mutate(prop = ifelse(tot > 0,
n / tot, 0)) %>%
dplyr::left_join(., workforce_dept_prop_timeline,
by = c("leaveYear", "department")) %>%
dplyr::arrange(leaveYear, department, reason, gender)
ggplot(data_reasons, aes(x = leaveYear, y = prop, colour = reason))+
geom_line(alpha = 0.5)+
geom_smooth(se = FALSE)+
geom_line(aes(x = leaveYear, y = prop_workforce),
colour = "red")+
facet_wrap(department~reason)+
labs(title = "Women as a % of all who leave for each reason",
x = "Year of leaving",
y = "Proportion of women",
subtitle = "Department's overall proportion of women shown in red")+
BIT_theme+
scale_colour_manual(values = strong_palette)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Significant association between gender and being terminated?
reason_assoc <- chisq.test(table(data$gender, data$term))
There appears to be a strong association between gender and being terminated, at a Chi squared of 30.52, p < .001.
I want to dig deeper into this association between gender and being terminated. First, I created a set of mosaic plots to explore the bivariate association between termination and other variables. Click the button below to see those.
Click for visualisations bivariate associations with termination
library("graphics")
data_dept <- as.table(as.matrix(table(data_left$department, data_left$term)))
mosaicplot(data_dept, shade = TRUE, las=2, main = "Department")
data_ageH <- as.table(as.matrix(table(data_left$ageAtHire, data_left$term)))
mosaicplot(data_ageH, shade = TRUE, las=2, main = "ageAtHire")
data_ageL <- as.table(as.matrix(table(data_left$age, data_left$term)))
mosaicplot(data_ageL, shade = TRUE, las=2, main = "ageWhenLeft")
data_year <- as.table(as.matrix(table(data_left$hireYear, data_left$term)))
mosaicplot(data_year, shade = TRUE, las=2, main = "hireYear")
data_leave <- as.table(as.matrix(table(data_left$leaveYear, data_left$term)))
mosaicplot(data_leave, shade = TRUE, las=2, main = "leaveYear")
# data_ten <- as.table(as.matrix(data_left$tenure, data_left$term))
# mosaicplot(data_ten, shade = TRUE, las=2, main = "Tenure")
data_gend <- as.table(as.matrix(table(data_left$gender, data_left$term)))
mosaicplot(data_gend, shade = TRUE, las=2, main = "Gender")
data_grade <- as.table(as.matrix(table(data_left$gradeAtHire, data_left$term)))
mosaicplot(data_grade, shade = TRUE, las=2, main = "GradeAtHire")
I then formalised these analyses with an ANOVA analysis on termination.
summary(aov(term ~ gradeAtHire * department * gender * ageAtHire * leaveYear, data = data_left))
## Df Sum Sq Mean Sq
## gradeAtHire 4 0.3 0.0866
## department 1 0.9 0.8950
## gender 1 2.3 2.3373
## ageAtHire 1 0.0 0.0363
## leaveYear 1 0.0 0.0004
## gradeAtHire:department 3 0.1 0.0430
## gradeAtHire:gender 4 0.2 0.0603
## department:gender 1 0.0 0.0108
## gradeAtHire:ageAtHire 4 0.8 0.2000
## department:ageAtHire 1 0.2 0.1544
## gender:ageAtHire 1 0.0 0.0172
## gradeAtHire:leaveYear 4 0.1 0.0138
## department:leaveYear 1 0.1 0.0532
## gender:leaveYear 1 0.0 0.0262
## ageAtHire:leaveYear 1 0.0 0.0147
## gradeAtHire:department:gender 3 0.3 0.0952
## gradeAtHire:department:ageAtHire 3 0.3 0.1069
## gradeAtHire:gender:ageAtHire 4 0.1 0.0313
## department:gender:ageAtHire 1 0.1 0.1044
## gradeAtHire:department:leaveYear 3 0.0 0.0057
## gradeAtHire:gender:leaveYear 4 1.7 0.4151
## department:gender:leaveYear 1 0.0 0.0422
## gradeAtHire:ageAtHire:leaveYear 4 1.2 0.2915
## department:ageAtHire:leaveYear 1 0.0 0.0240
## gender:ageAtHire:leaveYear 1 0.0 0.0001
## gradeAtHire:department:gender:ageAtHire 3 0.1 0.0361
## gradeAtHire:department:gender:leaveYear 3 0.1 0.0284
## gradeAtHire:department:ageAtHire:leaveYear 3 0.4 0.1229
## gradeAtHire:gender:ageAtHire:leaveYear 4 0.1 0.0154
## department:gender:ageAtHire:leaveYear 1 0.0 0.0111
## gradeAtHire:department:gender:ageAtHire:leaveYear 3 0.1 0.0291
## Residuals 3880 398.4 0.1027
## F value Pr(>F)
## gradeAtHire 0.843 0.49759
## department 8.716 0.00317 **
## gender 22.762 1.9e-06 ***
## ageAtHire 0.354 0.55207
## leaveYear 0.004 0.95055
## gradeAtHire:department 0.419 0.73930
## gradeAtHire:gender 0.587 0.67214
## department:gender 0.105 0.74548
## gradeAtHire:ageAtHire 1.947 0.09981 .
## department:ageAtHire 1.504 0.22017
## gender:ageAtHire 0.167 0.68243
## gradeAtHire:leaveYear 0.134 0.96972
## department:leaveYear 0.518 0.47161
## gender:leaveYear 0.255 0.61329
## ageAtHire:leaveYear 0.143 0.70561
## gradeAtHire:department:gender 0.928 0.42648
## gradeAtHire:department:ageAtHire 1.041 0.37323
## gradeAtHire:gender:ageAtHire 0.304 0.87517
## department:gender:ageAtHire 1.017 0.31328
## gradeAtHire:department:leaveYear 0.056 0.98279
## gradeAtHire:gender:leaveYear 4.042 0.00284 **
## department:gender:leaveYear 0.411 0.52149
## gradeAtHire:ageAtHire:leaveYear 2.839 0.02296 *
## department:ageAtHire:leaveYear 0.234 0.62889
## gender:ageAtHire:leaveYear 0.001 0.97748
## gradeAtHire:department:gender:ageAtHire 0.351 0.78835
## gradeAtHire:department:gender:leaveYear 0.277 0.84221
## gradeAtHire:department:ageAtHire:leaveYear 1.197 0.30932
## gradeAtHire:gender:ageAtHire:leaveYear 0.150 0.96304
## department:gender:ageAtHire:leaveYear 0.108 0.74276
## gradeAtHire:department:gender:ageAtHire:leaveYear 0.284 0.83718
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Main effects of department, gender, and hireYear, but 3-way interaction of GradeAtHire, gender and HireYear
And visualised the three-way interaction I observed in the analysis.
data_term_leave <- data_left %>%
dplyr::group_by(leaveYear, department, gender) %>%
dplyr::summarise(leave_left = n(),
leave_terminated = sum(term),
leave_prop_term = sum(term) / n(),
m_ageAtHire = mean(ageAtHire),
sd_ageAtHire = sd(ageAtHire, na.rm = T),
m_ageWhenLeft = mean(age),
sd_ageWhenLeft = sd(age),
m_tenure = mean(tenure),
sd_tenure = sd(tenure),
m_gradeAtHire = mean(as.numeric(gradeAtHire)),
sd_gradeAtHire = sd(as.numeric(gradeAtHire))) %>%
dplyr::select("year" = leaveYear, department, gender, leave_left, leave_terminated, leave_prop_term)
r1 <- ggplot(data_term_leave, aes(x = year, y = leave_prop_term, colour = gender))+
geom_line(alpha = 0.5)+
geom_smooth(se = FALSE)+
facet_wrap(~department)+
labs(title = "Of those who left, the proportion terminated",
x = "Year left",
y = "Proportion terminated",
colour = "Gender",
subtitle = "By department, gender and year they left")+
BIT_theme+
scale_colour_manual(values = strong_palette)
data_term_hire <- data_left %>%
dplyr::group_by(hireYear, department, gender) %>%
dplyr::summarise(hire_left = n(),
hire_terminated = sum(term),
hire_prop_term = sum(term) / n(),
m_ageAtHire = mean(ageAtHire),
sd_ageAtHire = sd(ageAtHire, na.rm = T),
m_ageWhenLeft = mean(age),
sd_ageWhenLeft = sd(age),
m_tenure = mean(tenure),
sd_tenure = sd(tenure),
m_gradeAtHire = mean(as.numeric(gradeAtHire)),
sd_gradeAtHire = sd(as.numeric(gradeAtHire))) %>%
select("year" = hireYear, department, gender, hire_left, hire_terminated, hire_prop_term)
r2 <- ggplot(data_term_hire, aes(x = year, y = hire_prop_term, colour = gender))+
geom_line(alpha = 0.5)+
geom_smooth(se = FALSE)+
facet_wrap(~department)+
labs(title = "Of those who left, the proportion terminated",
x = "Year hired",
y = "Proportion terminated",
subtitle = "By department, gender and year they were hired")+
BIT_theme+
scale_colour_manual(values = strong_palette)
grid.arrange(r1, r2, ncol = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
data_rate <- data_timeline %>%
dplyr::group_by(year, department, gender) %>%
dplyr::summarise(workforce = n(),
left = sum(current != TRUE),
joined = sum(new == TRUE),
terminated = sum(term, na.rm = T)) %>%
dplyr::mutate(joining_rate = joined / workforce,
leaving_rate = left / workforce,
termination_rate = terminated / workforce,
term_leaving_rate = terminated / left) %>%
dplyr::select(year, department, gender,
"Joining rate" = joining_rate,
"Leaving rate" = leaving_rate,
"Termination rate - workforce" = termination_rate,
"Terminations - within leavers" = term_leaving_rate) %>%
tidyr::gather("rate", "value", -year, -department, -gender)
ggplot(data_rate, aes(x = year, y = value, colour = gender))+
geom_line(alpha = 0.5)+
geom_smooth(se = FALSE)+
facet_wrap(department~rate, ncol = 4, scales = "free")+
labs(title = "Rates of joining, leaving and termination in the workforce",
x = "Year",
y = "Rate",
colour = "Gender",
subtitle = "As a proportion of the male or female workforce respectively, or of leavers only")+
BIT_theme+
scale_colour_manual(values = strong_palette)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'